* START_322211_BOXES_2002_07.SAS -- produces BOXES datasets;

%include "ASMimplibs.sas";


*OPTIONS obs=5000;run;



* PART 1: RAW DATA FOR PRODUCERS OF PRODUCT;

%macro chk(y,p);


  /* Compute PQS totals by product and imputation flag. */

    data boxes_prods&y;
     set cmf.cmf&y.prod (keep = survu_id NAICSPC pqs pv pqs_f);
     if NAICSPC in (&p);
     if pqs_f in (' V','RV',' J','RJ')
     then pqs_imp=1;
     else pqs_imp=0;
    run;

    data boxes_estabs&y (keep = survu_id ar);
     set cmf.cmf&y;
     if NAICS_NEW = "32221100";
    run;

         proc sort data=boxes_prods&y; by survu_id; run;

         data boxes_estabs&y;
           merge  boxes_prods&y boxes_estabs&y ;
         by survu_id;
         run;

         proc sort data=boxes_estabs&y; by NAICSPC pqs_imp; run;

          proc summary data=boxes_estabs&y nway;
            by NAICSPC;
            var pqs;
          output out=totpqs&y sum=totpqs;run;

          proc summary data=boxes_estabs&y nway;
            by NAICSPC pqs_imp;
            var pqs;
          output out=totpqs_by_imp&y sum=totpqs_by_imp;run;
         
          data pqs_tots&y;
           merge totpqs&y totpqs_by_imp&y;
            by NAICSPC;
           run;

          data pqs_tots&y;
           set pqs_tots&y;
            pqs_pct = totpqs_by_imp/totpqs;
          run;

          proc print data=pqs_tots&y;
           var NAICSPC pqs_imp pqs_pct totpqs_by_imp totpqs;
          title1 "Boxes products";
          title2 "Pct. of PQS total that is imputed/non-imputed, &y";
           run;

          /* Now try excluding AR cases: */

          data nonar_estabs&y; set boxes_estabs&y; if ar ne 1; run;

          proc summary data=nonar_estabs&y nway;
            by NAICSPC;
            var pqs;
          output out=totpqs&y sum=totpqs;run;

          proc summary data=nonar_estabs&y nway;
            by NAICSPC pqs_imp;
            var pqs;
          output out=totpqs_by_imp&y sum=totpqs_by_imp;run;
         
          data pqs_tots&y;
           merge totpqs&y totpqs_by_imp&y;
            by NAICSPC;
           run;

          data pqs_tots&y;
           set pqs_tots&y;
            pqs_pct = totpqs_by_imp/totpqs;
          run;

          proc print data=pqs_tots&y;
           var NAICSPC pqs_imp pqs_pct totpqs_by_imp totpqs;
          title "Pct. of non-AR PQS total that is imputed/non-imputed, &y";
           run;



 data chk&y;
   set cmf.cmf&y.prod (keep = survu_id NAICSPC pv);
   if NAICSPC in (&p);
 run;

 title "&y -- # of product trailer records with boxes products";run;
 proc freq data=chk&y;table NAICSPC;run;

 title "&y -- product trailer records with boxes products";run;
 proc print data= chk&y; run;

 proc sort data=chk&y nodupkey out=est&y;by survu_id;

 title "&y -- # of estabs that have boxes  in product trailer file";run;
 title2 "ignore the actual products in this case";run;
 proc freq data=est&y;table NAICSPC;run;

 *now get all of the product records for these guys;
 * these other products will be used in creating the PPSR measure;
 * so dont want to include NAICSPC STARTING WITH 9 or 7, balancing codes;
 * admin records, or pv<0;

  data fhs.boxes&y;
   merge est&y(in=one keep=survu_id )
         cmf.cmf&y.prod(keep=survu_id NAICSPC NAICSPC_COL pqs pv pqs_f pv_f );
   by survu_id;
   if one=1 and pv ne . and NAICSPC ne "";
   if NAICSPC in (&p) then cflag=1;else cflag=0; *create a flag for boxes data;
   if pqs>0 then pflag=1;else pflag=0;
   NAICSPC1=substr(NAICSPC,1,1);
   NAICSPC7=substr(NAICSPC,7,4);
   NAICSPC8=substr(NAICSPC,1,8);

   if NAICSPC1='9' or NAICSPC8 in ('77000000' , '00093000') OR NAICSPC in ('0009998900' , '0009998000' ) OR NAICSPC7 in ('WYWW' , '0YWW' , '000-' ) or pv<0
   then dflag=0;
   else dflag=1;
  run;

  title "&y -- checking nonlegit product codes"; 
  data xx&y;
    set fhs.boxes&y;
    if dflag=0;
  run;

  proc freq data=xx&y;table NAICSPC;run;

  data xx&y;
     set xx&y;
     if pv<0;
  run;

  title "&y -- Product trailer records with pv<0 (EXCLUDING PV = .)";run; 
  title2 "For plants that produce boxes";run; 
  proc print data=xx&y;var survu_id NAICSPC pv;run;

  
  title "&y -- # of product trailer records (and boxes vs not boxes)";run;
  title2 "&y -- for box-producing plants";run; 
  title3 "cflag=1: boxes;  cflag=0: not boxes";run;
  proc freq data=fhs.boxes&y;table cflag;run;

  title "&y -- Edit/impute flags, legit box products"; 
  title2 "with pv > 0 ";run; 
   proc freq data=fhs.boxes&y; table pqs_f pv_f;
   where cflag=1 and pv > 0 and dflag=1;
  run;


  title "&y -- # of legit products vs. non-legit or pv<0";run;
  title2 "dflag=1: legit product codes"; run;
  proc freq data=fhs.boxes&y;table dflag;run;

  title "&y -- # of physical products";run;
  title2 "pflag=1: product trailer record has physical product data (pqs>0)";
  proc freq data=fhs.boxes&y;table pflag pflag*dflag;run;



 data box_ourprods&y;
  set fhs.boxes&y;
  if dflag=1;   *keep legitimate product codes only;
  if cflag=1;   *and keep only the FHS boxes products;
  if pv ne . then pqspvrat = pqs/pv;
  price = 1/pqspvrat;
 run;

 proc sort data=box_ourprods&y; by NAICSPC; run;


 data box_ourprods&y;
  set box_ourprods&y;
    rounded_price0001 = round(price,0.0001);
    rounded_price002 = round(price,0.002);
   run;
 
 proc univariate data=box_ourprods&y;
  var rounded_price0001;
  by NAICSPC;
  output out=mode_prices&y mode= modeprice;
  title "price by product, boxes industry, year=&y ";
 run;

 proc univariate data=box_ourprods&y;
  var rounded_price002;
  by NAICSPC;
  output out=mode_prices002&y mode= modeprice002;
  title "price (rounded by 0.002) by product, boxes industry, year=&y ";
 run;

 data box_ourprods&y;
   merge box_ourprods&y mode_prices&y mode_prices002&y;
   by NAICSPC; 
   /* Identify the imputed PQS items. */
   if pqs = round(pv/modeprice,1)
   then pqs_imp_priceflag=1;
   else pqs_imp_priceflag=0;
   if rounded_price002^= . and rounded_price002 = modeprice002
   then pqs_imp_FHS002=1;
   else pqs_imp_FHS002=0;
   if pqs_f in (' V','RV') 
   then pqs_imp_Vflag=1;
   else pqs_imp_Vflag=0;
   if mpqspvrat^= . and pqs ne 0 and pqs_f in (' V','RV') 
   then Nonzero_pqs_imp_Vflag=1;
   else Nonzero_pqs_imp_Vflag=0;
   if pqs_f in (' V','RV',' J','RJ')  
   then pqs_imp_anyflag=1;
   else pqs_imp_anyflag=0;
   pqs_f1 = substr(pqs_F,1,1);
 run;

 title "PQS imputation flag by product, boxes in &y";
 proc freq data=box_ourprods&y;
  tables pqs_imp_FHS002  pqs_imp_priceflag Nonzero_pqs_imp_Vflag pqs_imp_Vflag pqs_imp_anyflag;
 by NAICSPC;
 run;
 


 proc univariate data=box_ourprods&y;
  var pqs;
  where pqs_imp_Vflag=1; 
  title "&y -- distribution of PQS (all products), pqs_f = RV or V ";
 run;

 proc univariate data=box_ourprods&y;
  var rounded_price0001;
  by NAICSPC ;
  where pqs_imp_Vflag=1; 
  title "&y -- rounded price ratio by product where pqs_f = RV or V";
 run;

 proc univariate data=box_ourprods&y;
  var rounded_price0001;
  by NAICSPC ;
  where pqs_imp_Vflag=0; 
  title "&y -- rounded price ratio by product where pqs_f ^= RV or V";
 run;



 proc univariate data=box_ourprods&y;
  var rounded_price0001;
  by NAICSPC ;
  where pqs_f1 = 'R'; 
  title "&y -- price ratio by product where pqs_f1 = R ";
 run;


 data boxes_FHS002_id_fails&y (keep = PPN PQS PV pqs_f NAICSPC price modeprice modeprice002 rounded_price002);
  set box_ourprods&y;
   if pqs_imp_Vflag=1 and pqs_imp_FHS002=0;
   run;

  proc sort data=boxes_FHS002_id_fails&y; by NAICSPC; run;


  proc print data=boxes_FHS002_id_fails&y;
   var NAICSPC pqs pv price modeprice rounded_price002 modeprice002;
   title "Boxes plants with ratio-imputed PQS, where FHS ident fails, year=&y";
  run;


 data boxes_price0001_id_fails&y (keep = PPN PQS PV pqs_f NAICSPC price modeprice rounded_price0001);
  set box_ourprods&y;
   if pqs_imp_Vflag=1 and pqs_imp_priceflag=0;
   run;

  data boxes_price0001_id_fails&y;
   set boxes_price0001_id_fails&y;
   pqs_impute = pv/modeprice;
   rounded_pqs_impute = round(pv/modeprice,1);
   pqs_less_pqs_impute = pqs - rounded_pqs_impute;
  run;

  proc sort data=boxes_price0001_id_fails&y; by NAICSPC; run;

  proc print data=boxes_price0001_id_fails&y;
   var NAICSPC pqs rounded_pqs_impute pqs_impute pqs_less_pqs_impute modeprice rounded_price0001;
   title "Boxes prods with (V) imputed PQS, where price ident fails, year=&y";
  run;



 data boxes_false_positives&y (keep = PPN PQS PV pqs_f NAICSPC price modeprice rounded_price0001);
  set box_ourprods&y;
   if pqs_imp_Vflag=0 and pqs_imp_priceflag=1;
   run;

  data boxes_false_positives&y;
   set boxes_false_positives&y;
   pqs_impute = pv/modeprice;
   rounded_pqs_impute = round(pv/modeprice,1);
   pqs_less_pqs_impute = pqs - rounded_pqs_impute;
  run;

  proc sort data=boxes_false_positives&y; by NAICSPC; run;

  proc print data=boxes_false_positives&y;
   var NAICSPC pqs rounded_pqs_impute pqs_impute pqs_less_pqs_impute modeprice rounded_price0001;
   title "Boxes with non-imputed PQS, where imp price flag=1, year=&y";
  run;


 data boxes_correct_price_ids&y (keep = PPN PQS PV pqs_f NAICSPC price modeprice rounded_price0001);
  set box_ourprods&y;
   if pqs_imp_Vflag=1 and pqs_imp_priceflag=1;
   run;

  data boxes_correct_price_ids&y;
   set boxes_correct_price_ids&y;
   pqs_impute = pv/modeprice;
   rounded_pqs_impute = round(pv/modeprice,1);
   pqs_less_pqs_impute = pqs - rounded_pqs_impute;
  run;

  proc sort data=boxes_correct_price_ids&y; by NAICSPC; run;

  proc print data=boxes_correct_price_ids&y;
   var NAICSPC pqs rounded_pqs_impute pqs_impute pqs_less_pqs_impute modeprice rounded_price0001;
   title "Boxes where price flag correctly ids PQS, year=&y";
  run;


%mend;


%chk(2002,%STR('3222110111' , '3222110114' , '3222110221' , '3222110341' , '3222110345' , '3222110431' , '3222110433' , '3222110435' , '3222110437' , '3222110551' , '3222110661' , '3222110665'));
%chk(2007,%STR('3222110113' , '3222110221' , '3222110343' , '3222110436'));


* PART 2: CLEANED UP DATA;
* Exclusion based on low ppsr;
* ppsr calculated only on legit codes;
* and using total product for numerator;

%macro chk2(y,p);

* Creating PPSR:
  * 1) Pick only legitimate codes;
         data good&y;
           set fhs.boxes&y;
           if dflag=1;     *legitimate product codes only;
         run;
         proc sort data=good&y;by SURVU_ID;run;

  * 2) Sum up all of the products that the estab has;
          proc summary data=good&y nway;
            by survu_id;
            var pv;
          output out=totpv&y sum=totpv;run;

  * 3) Create single product of interest (if needed);
   * 3.1) Construct plant-level impute flag ;

         data cc&y;
           set good&y;
           if NAICSPC in (&p);
           if pqs_f in (' V','RV',' J','RJ')
           then pqs_imp=1;
           else pqs_imp=0;
           if pv_f in (' H',' L',' J',' B','RL','RB','RH')
           then pv_imp=1;
           else pv_imp=0;
           prodcount=1;
           pqs_valimp=pqs*pqs_imp;
           pv_valimp=pv*pv_imp;
         run;


         data test&y;
           set cc&y;
           if pv>0 then pqspvrat=pqs/pv;
           if pqspvrat ne .;
         run;
         proc sort data=test&y; by pqs_imp; run;
        

         proc sort data=test&y; by NAICSPC; run;
 


/* Look at scatterplots of ln(PQS) vs ln(PV): */

 data test&y; 
  set test&y;
   ln_PV = log(pv);
   if pqs_f in (' V','RV')
   then do;
     ln_pqs = log(pqs);
     ln_pqs_imp= ln_pqs;
   end;
   else do;
     ln_pqs = log(pqs);
     ln_pqs_nonimp=ln_pqs;
   end;
 run; 

 filename out '/projects/arts110/imputation_and_productivity/tr00617/programs/ASM_imputation/replicate_FHS/boxes';

 goptions device=gif gsfname=out cback=white border htitle=18pt ;

 axis1 label=("Ln(PV)") /* value=NONE */;
 axis2 label=(angle=90 "Ln(PQS)") /* value=NONE */;

 symbol1 interpol=none value=plus color=black;
 symbol2 interpol=none value=circle color=red;

 symbol3 interpol=r1 value=none color=black;

 legend1 frame label=none repeat=1 value=("Non-imputed" "Imputed");

 ods graphics on;

 title1 "&y boxes manufacturing plants";
 title2 "Census Bureau's imputations vs. non-imputed data";
 title3 "By product";

 proc sort data=test&y; by NAICSPC; RUN;

 proc gplot data=test&y;
   plot ln_pqs*ln_pv=pqs_imp  / haxis=axis1 vaxis=axis2 legend=legend1 name="boxes&y._lnPQS_on_lnPV_CBdata" ;
   by NAICSPC;
 run;


         

         proc summary data=cc&y nway;
            by survu_id;
            var prodcount;
          output out=prodcnt&y sum=;
         run;


         proc summary data=cc&y nway;
            by survu_id;
            var pv pqs pqs_imp pv_imp pqs_valimp pv_valimp;
          output out=prod&y sum=;
         run;

         data prod&y;
            merge prodcnt&y prod&y;
            by survu_id;
         run;

         data prod&y;
          set prod&y;
          pqs_imprat = pqs_imp/prodcount;
          label pqs_imprat="Fraction of PQS records (our products only) with imputed data";
          pv_imprat = pv_imp/prodcount;
          label pv_imprat="Fraction of PV records (our products only) with imputed data";
          if pqs > 0 then pqs_valimpr = pqs_valimp/pqs; else pqs_valimpr = .;
          label pqs_valimpr = "Proportion of PQS VALUE (our products only) that is imputed"; 
          if pv > 0 then pv_valimpr = pv_valimp/pv; else pv_valimpr = .;
          label pv_valimpr = "Proportion of PV VALUE (our products only) that is imputed"; 
         run;

  * 4) Create the price and ppsr;
         data chk1&y;
            merge prod&y totpv&y;
            by survu_id;
            if totpv>0 then ppsr1=pv/totpv;
            label ppsr1="Product Specialization Ratio (version 1)";
            if ppsr1=1 then ttflag=1;else ttflag=0;
            label ttflag="TTFLAG=1 when there is only 1 product produced";
            if pqs>0 then price=pv/pqs;
            label price="Constructed Price=PV/PQS";
          run;

          title "&y -- Price properties";run;
          proc univariate data=chk1&y;var price;run;

  * 5) Properties of PPSR;
         title "&y -- PPSR1 properties";run;
         proc univariate data=chk1&y;var ppsr1;run;
         proc freq data=chk1&y;table ttflag;run;
         
           * now just looking at PPSR for multi-product estabs;
             data multi&y;
               set chk1&y;
               if ttflag=0;
             run;

             title2 "Just for multi-product estabs";run;
             proc univariate data=multi&y;var ppsr1;run;
             
   * 6) Applying exlusion rule based on PPSR & PQS;
   * have changed this to be just flagged;

           data big&y;
             set chk1&y;
             if ppsr1>0.50 then ppsrflag=1;else ppsrflag=0;
             if pqs=0 then phyflag=0;else phyflag=1;
             label phyflag="PHYFLAG=1 when has physical data";
           run;

           title "&y -- estabs with ppsr1>0.50 & physical data";run;
           proc freq data=big&y;table ppsrflag phyflag;run;

           data fhs.boxesf&y (keep=survu_id price pqs pv ppsr1 phyflag pqs_imprat pv_imprat pqs_valimpr pv_valimpr);
             set big&y;
           ** if phyflag=1;
           run;
     %mend;

%chk2(2002,%STR('3222110111' , '3222110114' , '3222110221' , '3222110341' , '3222110345' , '3222110431' , '3222110433' , '3222110435' , '3222110437' , '3222110551' , '3222110661' , '3222110665'));
%chk2(2007,%STR('3222110113' , '3222110221' , '3222110343' , '3222110436'));


  * PART 3: PROPERTIES OF THE FINAL DATASET;
%macro chk3(y,i);

 * 1) Their Industries;
   proc sort data=fhs.boxesf&y out=boxesf&y;by survu_id;run;

   data ind&y;
     merge boxesf&y (in=one) cmf.cmf&y (keep=survu_id NAICS_NEW);
     by survu_id;
     if one=1;
   run;

  title "&y -- NAICS_NEW of final boxes dataset";run;
  proc freq data=ind&y;table NAICS_NEW;run;

  * 1.1) Imputations of product-trailer data for our products;

    title "&y -- Imputation rates for boxes products, prod trailer data"; run;
    proc freq data=fhs.boxesf&y; 
     tables pqs_imprat pv_imprat; 
    run;

    title "&y -- Proportions of total PQS and PV value imputed, boxes products"; run;
    proc means data=fhs.boxesf&y N min p10 q1 median mean q3 p90 max; 
     var pqs_valimpr pv_valimpr; 
    run;

    data testcorr;
     set fhs.boxesf&y;
     if price >0 then lprice = log(price);
     if pqs>0 then lqphy=log(pqs);
     if pqs_imprat > 0.5 then pqs_imp=1; else pqs_imp=0;
    run;

    proc sort data = testcorr; by pqs_imp; run;

    title "&y -- Correlations, by PQS imputation rate";
    title2 "pqs_imp=1 if PQS imp rate > 0.5"; run;
    proc corr data=testcorr;
      var lprice lqphy;
     by pqs_imp;
    run;


  * 2) Coverage of the Industry;
    proc sort data=fhs.boxesf&y out=boxesf&y;by survu_id;run;
    data boxesf&y;
     set boxesf&y;
     ourflag=1;
    run;

   data tot&y;
     merge boxesf&y (keep=survu_id ourflag phyflag) cmf.cmf&y (keep=survu_id NAICS_NEW tvs ar tabbed);
     by survu_id;
   run;

   proc freq data=tot&y;table ourflag;run;

   data tot&y;
     set tot&y;
     if ourflag=. then ourflag=0;
     if NAICS_NEW in ("&i");
   run;

  title "&y-- our sample vs total sample";
  proc freq data=tot&y;table ourflag ourflag*phyflag ar*ourflag tabbed*ourflag; run;

  proc summary data=tot&y;
    var tvs;
    class ourflag ;
  output out=sum&y sum=;run;

  title "&y -- total tvs";run;
  proc print data=sum&y;run;

   data nonar&y;
    set tot&y;
    if ar=0;
   run;

  proc summary data=nonar&y;
    var tvs;
    class ourflag;
  output out=sum2&y sum=;run;

  title "&y -- non-ar tvs";run;
  proc print data=sum2&y;run;

  proc summary data=nonar&y;
    var tvs;
    class phyflag;
  output out=sum3&y sum=;run;

  title "&y -- non-ar tvs";run;
  proc print data=sum3&y;run;


%mend;


%chk3(2002,32221100);
%chk3(2007,32221100);

